// OpenCL C source for three kernels, stored as a single C string built from
// adjacent string literals. The C comments placed between the literals below
// are NOT part of the kernel source; they only document it for readers of
// this file.
//
// Kernels contained in the source:
//   SearchSortDataLowerKernel - for every index i in [0, nSrc) whose key
//       differs from the key at i-1, stores i into dst[src[i].m_key].
//   SearchSortDataUpperKernel - for every index i in [0, nSrc] whose key
//       differs from the key at i-1, stores i into dst[src[i-1].m_key].
//   SubtractKernel            - C[i] = A[i] - B[i] for i in [0, nDst).
//
// NOTE(review): the search kernels presumably expect src to be sorted by
// m_key and dst to hold nDst entries - confirm against the host-side caller.
// NOTE(review): if this text is generated from a .cl file, mirror any change
// made here in that file as well.
static const char* boundSearchKernelsCL= \
"/*\n"
"		2011 Takahiro Harada\n"
"*/\n"
"\n"
"typedef unsigned int u32;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"\n"
// Key/value pair read by the search kernels; only m_key is inspected.
"typedef struct\n"
"{\n"
"	u32 m_key; \n"
"	u32 m_value;\n"
"}SortData;\n"
"\n"
"\n"
"\n"
// By-value kernel argument: m_nSrc bounds reads of src, m_nDst bounds writes
// to dst (and the element count in SubtractKernel).
"typedef struct\n"
"{\n"
"	u32 m_nSrc;\n"
"	u32 m_nDst;\n"
"	u32 m_padding[2];\n"
"} ConstBuffer;\n"
"\n"
"\n"
"\n"
// Lower bound: one work-item per src element. The predecessor of element 0 is
// a sentinel with key 0xFFFFFFFF. The store is skipped when the key is not a
// valid dst index, so a stray key can no longer write out of bounds.
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"__kernel\n"
"void SearchSortDataLowerKernel(__global SortData* src, __global u32 *dst, \n"
"					ConstBuffer cb)\n"
"{\n"
"	int gIdx = GET_GLOBAL_IDX;\n"
"	u32 nSrc = cb.m_nSrc;\n"
"	u32 nDst = cb.m_nDst;\n"
"\n"
"	if( gIdx < nSrc )\n"
"	{\n"
"		SortData first; first.m_key = (u32)(-1); first.m_value = (u32)(-1);\n"
"\n"
"		SortData iData = (gIdx==0)? first: src[gIdx-1];\n"
"		SortData jData = src[gIdx];\n"
"\n"
"		if( iData.m_key != jData.m_key )\n"
"		{\n"
"			u32 k = jData.m_key;\n"
"			if( k < nDst )\n"
"			{\n"
"				dst[k] = gIdx;\n"
"			}\n"
"		}\n"
"	}\n"
"}\n"
"\n"
"\n"
// Upper bound: one work-item per src element plus one past the end. The
// predecessor of element 0 is a sentinel with key 0, and the successor of the
// last element is a sentinel with key nDst. The store is skipped when the key
// is not a valid dst index.
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"__kernel\n"
"void SearchSortDataUpperKernel(__global SortData* src, __global u32 *dst, \n"
"					ConstBuffer cb)\n"
"{\n"
"	int gIdx = GET_GLOBAL_IDX;\n"
"	u32 nSrc = cb.m_nSrc;\n"
"	u32 nDst = cb.m_nDst;\n"
"\n"
"	if( gIdx < nSrc+1 )\n"
"	{\n"
"		SortData first; first.m_key = 0; first.m_value = 0;\n"
"		SortData end; end.m_key = nDst; end.m_value = nDst;\n"
"\n"
"		SortData iData = (gIdx==0)? first: src[gIdx-1];\n"
"		SortData jData = (gIdx==nSrc)? end: src[gIdx];\n"
"\n"
"		if( iData.m_key != jData.m_key )\n"
"		{\n"
"			u32 k = iData.m_key;\n"
"			if( k < nDst )\n"
"			{\n"
"				dst[k] = gIdx;\n"
"			}\n"
"		}\n"
"	}\n"
"}\n"
"\n"
// Element-wise difference over the first nDst entries: C = A - B.
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"__kernel\n"
"void SubtractKernel(__global u32* A, __global u32 *B, __global u32 *C, \n"
"					ConstBuffer cb)\n"
"{\n"
"	int gIdx = GET_GLOBAL_IDX;\n"
"	u32 nDst = cb.m_nDst;\n"
"\n"
"	if( gIdx < nDst )\n"
"	{\n"
"		C[gIdx] = A[gIdx] - B[gIdx];\n"
"	}\n"
"}\n"
"\n"
;
